Assignment - 01

Author

Sean Kim

Step 1

library(data.table)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Folder holding the two daily PM2.5 CSV exports. Change this single line to
# run the analysis from a different location.
data_dir <- "/Users/seankim/Downloads"

# Read the 2002 and 2022 files into data.tables.
prt.02 <- fread(file.path(data_dir, "ad_viz_plotval_data_2002.csv"))
prt.22 <- fread(file.path(data_dir, "ad_viz_plotval_data_2022.csv"))

Dimensions:

# Number of rows and columns in the 2002 table.
dim(prt.02)
[1] 15976    20
# Number of rows and columns in the 2022 table.
dim(prt.22)
[1] 56140    20

2002 data has 20 variables and 15976 observations. 2022 data has 20 variables and 56140 observations.

Headers and Footers:

# First six rows of the 2002 table.
head(prt.02)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2: 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3: 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4: 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5: 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6: 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              78 Livermore               1              100
2:              92 Livermore               1              100
3:              71 Livermore               1              100
4:              80 Livermore               1              100
5:              98 Livermore               1              100
6:             115 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
# First six rows of the 2022 table.
head(prt.22)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2: 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3: 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4: 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5: 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6: 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              52 Livermore               1              100
2:              55 Livermore               1              100
3:              30 Livermore               1              100
4:              15 Livermore               1              100
5:              18 Livermore               1              100
6:              16 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
# Last six rows of the 2002 table.
tail(prt.02)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
2: 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
3: 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
4: 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
5: 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
6: 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              57 Woodland-Gibson Road               1              100
2:              57 Woodland-Gibson Road               1              100
3:               4 Woodland-Gibson Road               1              100
4:              74 Woodland-Gibson Road               1              100
5:              21 Woodland-Gibson Road               1              100
6:              25 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327
# Last six rows of the 2022 table.
tail(prt.22)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/01/2022    AQS 61131003   1                            3.4 ug/m3 LC
2: 12/07/2022    AQS 61131003   1                            3.8 ug/m3 LC
3: 12/13/2022    AQS 61131003   1                            6.0 ug/m3 LC
4: 12/19/2022    AQS 61131003   1                           34.8 ug/m3 LC
5: 12/25/2022    AQS 61131003   1                           23.2 ug/m3 LC
6: 12/31/2022    AQS 61131003   1                            1.0 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              14 Woodland-Gibson Road               1              100
2:              16 Woodland-Gibson Road               1              100
3:              25 Woodland-Gibson Road               1              100
4:              99 Woodland-Gibson Road               1              100
5:              74 Woodland-Gibson Road               1              100
6:               4 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327

Variable names and types

# Column names, storage types, and leading values of the 2002 table.
str(prt.02)
Classes 'data.table' and 'data.frame':  15976 obs. of  20 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily Mean PM2.5 Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  78 92 71 80 98 115 87 57 65 107 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
# Column names, storage types, and leading values of the 2022 table.
str(prt.22)
Classes 'data.table' and 'data.frame':  56140 obs. of  20 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily Mean PM2.5 Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  52 55 30 15 18 16 10 29 54 47 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
# Per-column summary of the 2002 table (quantiles for numeric columns, NA counts
# where present).
summary(prt.02)
     Date              Source             Site ID              POC       
 Length:15976       Length:15976       Min.   :60010007   Min.   :1.000  
 Class :character   Class :character   1st Qu.:60290014   1st Qu.:1.000  
 Mode  :character   Mode  :character   Median :60590007   Median :1.000  
                                       Mean   :60549600   Mean   :1.581  
                                       3rd Qu.:60731002   3rd Qu.:1.000  
                                       Max.   :61131003   Max.   :6.000  
                                                                         
 Daily Mean PM2.5 Concentration    UNITS           DAILY_AQI_VALUE 
 Min.   :  0.00                 Length:15976       Min.   :  0.00  
 1st Qu.:  7.00                 Class :character   1st Qu.: 29.00  
 Median : 12.00                 Mode  :character   Median : 50.00  
 Mean   : 16.12                                    Mean   : 53.68  
 3rd Qu.: 20.50                                    3rd Qu.: 69.00  
 Max.   :104.30                                    Max.   :176.00  
                                                                   
  Site Name         DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
 Length:15976       Min.   :1       Min.   :100      Min.   :88101     
 Class :character   1st Qu.:1       1st Qu.:100      1st Qu.:88101     
 Mode  :character   Median :1       Median :100      Median :88101     
                    Mean   :1       Mean   :100      Mean   :88215     
                    3rd Qu.:1       3rd Qu.:100      3rd Qu.:88502     
                    Max.   :1       Max.   :100      Max.   :88502     
                                                                       
 AQS_PARAMETER_DESC   CBSA_CODE      CBSA_NAME           STATE_CODE
 Length:15976       Min.   :12540   Length:15976       Min.   :6   
 Class :character   1st Qu.:23420   Class :character   1st Qu.:6   
 Mode  :character   Median :40140   Mode  :character   Median :6   
                    Mean   :33270                      Mean   :6   
                    3rd Qu.:41740                      3rd Qu.:6   
                    Max.   :49700                      Max.   :6   
                    NA's   :929                                    
    STATE            COUNTY_CODE        COUNTY          SITE_LATITUDE  
 Length:15976       Min.   :  1.00   Length:15976       Min.   :32.63  
 Class :character   1st Qu.: 29.00   Class :character   1st Qu.:34.07  
 Mode  :character   Median : 59.00   Mode  :character   Median :35.36  
                    Mean   : 54.78                      Mean   :36.00  
                    3rd Qu.: 73.00                      3rd Qu.:37.77  
                    Max.   :113.00                      Max.   :41.71  
                                                                       
 SITE_LONGITUDE  
 Min.   :-124.2  
 1st Qu.:-121.4  
 Median :-119.1  
 Mean   :-119.4  
 3rd Qu.:-117.9  
 Max.   :-115.5  
                 
# Per-column summary of the 2022 table (quantiles for numeric columns, NA counts
# where present).
summary(prt.22)
     Date              Source             Site ID              POC        
 Length:56140       Length:56140       Min.   :60010007   Min.   : 1.000  
 Class :character   Class :character   1st Qu.:60310004   1st Qu.: 1.000  
 Mode  :character   Mode  :character   Median :60631006   Median : 3.000  
                                       Mean   :60567850   Mean   : 2.549  
                                       3rd Qu.:60750005   3rd Qu.: 3.000  
                                       Max.   :61131003   Max.   :21.000  
                                                                          
 Daily Mean PM2.5 Concentration    UNITS           DAILY_AQI_VALUE 
 Min.   : -2.20                 Length:56140       Min.   :  0.00  
 1st Qu.:  4.20                 Class :character   1st Qu.: 18.00  
 Median :  6.90                 Mode  :character   Median : 29.00  
 Mean   :  8.52                                    Mean   : 32.84  
 3rd Qu.: 10.80                                    3rd Qu.: 45.00  
 Max.   :302.50                                    Max.   :353.00  
                                                                   
  Site Name         DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
 Length:56140       Min.   :1       Min.   :100      Min.   :88101     
 Class :character   1st Qu.:1       1st Qu.:100      1st Qu.:88101     
 Mode  :character   Median :1       Median :100      Median :88101     
                    Mean   :1       Mean   :100      Mean   :88197     
                    3rd Qu.:1       3rd Qu.:100      3rd Qu.:88101     
                    Max.   :1       Max.   :100      Max.   :88502     
                                                                       
 AQS_PARAMETER_DESC   CBSA_CODE      CBSA_NAME           STATE_CODE
 Length:56140       Min.   :12540   Length:56140       Min.   :6   
 Class :character   1st Qu.:31080   Class :character   1st Qu.:6   
 Mode  :character   Median :40140   Mode  :character   Median :6   
                    Mean   :35340                      Mean   :6   
                    3rd Qu.:41860                      3rd Qu.:6   
                    Max.   :49700                      Max.   :6   
                    NA's   :4199                                   
    STATE            COUNTY_CODE        COUNTY          SITE_LATITUDE  
 Length:56140       Min.   :  1.00   Length:56140       Min.   :32.58  
 Class :character   1st Qu.: 31.00   Class :character   1st Qu.:34.14  
 Mode  :character   Median : 63.00   Mode  :character   Median :36.50  
                    Mean   : 56.64                      Mean   :36.33  
                    3rd Qu.: 75.00                      3rd Qu.:37.97  
                    Max.   :113.00                      Max.   :41.76  
                                                                       
 SITE_LONGITUDE  
 Min.   :-124.2  
 1st Qu.:-121.5  
 Median :-119.7  
 Mean   :-119.7  
 3rd Qu.:-118.1  
 Max.   :-115.5  
                 
# Total number of NA cells across every column of the 2002 table.
sum(is.na(prt.02))
[1] 929
# Total number of NA cells across every column of the 2022 table.
sum(is.na(prt.22))
[1] 4199

Checking data: The summaries show one variable with implausible values: Daily Mean PM2.5 Concentration, which has values below 0 in the 2022 data. To remove these implausible values, I filtered the 2022 data to include only observations with PM2.5 >= 0.

# Six-number summary of the 2002 daily PM2.5 concentrations.
summary(prt.02$`Daily Mean PM2.5 Concentration`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    7.00   12.00   16.12   20.50  104.30 
# Six-number summary of the 2022 daily PM2.5 concentrations.
summary(prt.22$`Daily Mean PM2.5 Concentration`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -2.20    4.20    6.90    8.52   10.80  302.50 

A minimum PM2.5 concentration of -2.2 is not physically plausible, so I will subset prt.22 to include only observations with PM2.5 >= 0. There were 929 NA’s in the 2002 data and 4199 NA’s in the 2022 data, all of which were in the “CBSA_CODE” column.

# Keep only rows whose PM2.5 concentration is non-negative.
# The column name must be wrapped in backticks so data.table evaluates it as a
# column. In quotes it is a plain string literal, and comparing that string
# with 0 yields a single TRUE, which keeps every row.
prt.22 <- prt.22[`Daily Mean PM2.5 Concentration` >= 0]

Step 2: Combine Data

# Tag each table with its collection year (the column is added by reference),
# then stack the two tables into a single one.
prt.02[, year := 2002]
prt.22[, year := 2022]
data_combined <- rbindlist(list(prt.02, prt.22))

# Give the columns used in later steps shorter names.
setnames(
  data_combined,
  old = c(
    "Daily Mean PM2.5 Concentration",
    "SITE_LATITUDE",
    "SITE_LONGITUDE",
    "Site Name"
  ),
  new = c("PM2.5", "Lat", "Lon", "SiteName")
)
head(data_combined)
         Date Source  Site ID POC PM2.5    UNITS DAILY_AQI_VALUE  SiteName
1: 01/05/2002    AQS 60010007   1  25.1 ug/m3 LC              78 Livermore
2: 01/06/2002    AQS 60010007   1  31.6 ug/m3 LC              92 Livermore
3: 01/08/2002    AQS 60010007   1  21.4 ug/m3 LC              71 Livermore
4: 01/11/2002    AQS 60010007   1  25.9 ug/m3 LC              80 Livermore
5: 01/14/2002    AQS 60010007   1  34.5 ug/m3 LC              98 Livermore
6: 01/17/2002    AQS 60010007   1  41.0 ug/m3 LC             115 Livermore
   DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE       AQS_PARAMETER_DESC
1:               1              100              88101 PM2.5 - Local Conditions
2:               1              100              88101 PM2.5 - Local Conditions
3:               1              100              88101 PM2.5 - Local Conditions
4:               1              100              88101 PM2.5 - Local Conditions
5:               1              100              88101 PM2.5 - Local Conditions
6:               1              100              88101 PM2.5 - Local Conditions
   CBSA_CODE                         CBSA_NAME STATE_CODE      STATE
1:     41860 San Francisco-Oakland-Hayward, CA          6 California
2:     41860 San Francisco-Oakland-Hayward, CA          6 California
3:     41860 San Francisco-Oakland-Hayward, CA          6 California
4:     41860 San Francisco-Oakland-Hayward, CA          6 California
5:     41860 San Francisco-Oakland-Hayward, CA          6 California
6:     41860 San Francisco-Oakland-Hayward, CA          6 California
   COUNTY_CODE  COUNTY      Lat       Lon year
1:           1 Alameda 37.68753 -121.7842 2002
2:           1 Alameda 37.68753 -121.7842 2002
3:           1 Alameda 37.68753 -121.7842 2002
4:           1 Alameda 37.68753 -121.7842 2002
5:           1 Alameda 37.68753 -121.7842 2002
6:           1 Alameda 37.68753 -121.7842 2002

Step 3: Basic Map

library(leaflet)

# One row per monitoring site and year. Passing data_combined directly would
# draw a marker for every daily observation, stacking a large number of
# identical markers on each site and bloating the rendered widget.
site_locations <- unique(data_combined[, .(SiteName, Lat, Lon, year)])

# Interactive map of site locations: red for 2002, yellow for 2022; clicking a
# marker shows the site name.
leaflet(site_locations) %>% 
  addTiles() %>% 
  addCircleMarkers(
    lng = ~Lon,
    lat = ~Lat,
    radius = 1, 
    color = ~ifelse(year == 2002, "red", "yellow"), 
    weight = 2, 
    # Each site is drawn once per year now, so a marker has to be visible on
    # its own instead of relying on many stacked translucent copies.
    opacity = 0.8,
    popup = ~SiteName, 
    label = "Map of Sites Measured in 2002(red) and 2022 (yellow)")

Markers are highly concentrated in the major population centers of California - Sacramento, the Bay Area, and Los Angeles/San Diego. There is also broad coverage of the rest of the state, with sites distributed throughout. There appear to be more sites in 2022 than in 2002, since there are many more yellow markers than red ones.

Step 4: Checking for missing/implausible values of PM2.5 in combined dataset.

# Count missing PM2.5 values. The column is referenced by its full name;
# `$PM` only resolved through partial matching and would silently return NULL
# (and a count of 0) if another column starting with "PM" were ever added.
sum(is.na(data_combined$PM2.5))
[1] 0
# First six PM2.5 values of the combined table.
head(data_combined$PM2.5)
[1] 25.1 31.6 21.4 25.9 34.5 41.0
# Last six PM2.5 values of the combined table.
tail(data_combined$PM2.5)
[1]  3.4  3.8  6.0 34.8 23.2  1.0
# Six-number summary of PM2.5 across both years; the minimum shows whether any
# negative readings remain.
summary(data_combined$PM2.5)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   -2.2     4.5     7.7    10.2    12.4   302.5 

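There are no missing (NA) PM2.5 values in the combined dataset. Implausible values (PM2.5 < 0) were present only in the original 2022 data and were meant to be removed in Step 1; however, the summary above still shows a minimum of -2.2, so that filter needs to be verified before the data can be considered cleared of implausible values.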

Step 5: 3 different spatial levels for comparing daily concentrations of PM2.5 in CA from 2002 to 2022.

State-wide Data:

library(ggplot2)

# State-wide mean and standard deviation of daily PM2.5 for each year.
average_pm_by_year <- summarize(
  group_by(data_combined, year),
  Average_PM = mean(PM2.5, na.rm = TRUE),
  SD_PM = sd(PM2.5, na.rm = TRUE)
)

# Bar chart of the yearly means with error bars of one standard deviation
# above and below each mean.
ggplot(
  data = average_pm_by_year,
  mapping = aes(x = as.factor(year), y = Average_PM)
) +
  geom_bar(fill = "blue", stat = "identity") +
  geom_errorbar(
    mapping = aes(ymin = Average_PM - SD_PM, ymax = Average_PM + SD_PM),
    position = position_dodge(width = 0.9),
    width = 0.2
  ) +
  labs(
    x = "Year",
    y = "Average PM2.5 Level",
    title = "Average PM2.5 Level in California by Year (2002-2022)"
  )

# Unpaired (Welch) two-sample t-test of daily PM2.5: 2002 table vs. 2022 table.
# NOTE(review): this reads prt.22 directly, so negative readings are excluded
# only if the earlier filter on prt.22 actually removed them - confirm.
t_test_state <- t.test(prt.02$`Daily Mean PM2.5 Concentration`, prt.22$`Daily Mean PM2.5 Concentration`, paired = FALSE)
t_test_state

    Welch Two Sample t-test

data:  prt.02$`Daily Mean PM2.5 Concentration` and prt.22$`Daily Mean PM2.5 Concentration`
t = 66.435, df = 18799, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 7.371444 7.819640
sample estimates:
mean of x mean of y 
16.115943  8.520401 

On the state-wide level, there was a statistically significant decrease in mean daily PM2.5 concentration from 2002 to 2022.

p-value < 2.2e-16

County-wide data

# Per-county summary for 2002: mean and SD of daily PM2.5, plus the mean
# latitude/longitude of the county's observations (used to place map markers).
average_pm_by_county_02 <- summarize(
  group_by(data_combined[year == 2002], COUNTY),
  Average_PM_2002 = mean(PM2.5, na.rm = TRUE),
  SD_PM_2002 = sd(PM2.5, na.rm = TRUE),
  Year = mean(year),
  Lat = mean(Lat),
  Lon = mean(Lon)
)

# Same per-county summary for 2022.
average_pm_by_county_22 <- summarize(
  group_by(data_combined[year == 2022], COUNTY),
  Average_PM_2022 = mean(PM2.5, na.rm = TRUE),
  SD_PM_2022 = sd(PM2.5, na.rm = TRUE),
  Year = mean(year),
  Lat = mean(Lat),
  Lon = mean(Lon)
)

# Stack the 2002 and 2022 county summaries. Their value columns carry
# year-specific names (Average_PM_2002 vs. Average_PM_2022), so the tables are
# bound by position. Stating use.names = FALSE makes that explicit and avoids
# the name-mismatch message; the result keeps the 2002 column names, with the
# Year column distinguishing the rows.
County_mean <- rbindlist(
  list(average_pm_by_county_02, average_pm_by_county_22),
  use.names = FALSE
)
Column 2 ['Average_PM_2022'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
# Viridis colour scale spanning the stacked county means.
# NOTE(review): color_palette is reassigned later and does not appear to be
# used by any map in the visible part of the document.
color_palette <- colorNumeric("viridis", County_mean$Average_PM_2002)

# Green-to-brown colour scale over the 2002 county means.
temp.pal02 <- colorNumeric(
  palette = c("darkgreen", "goldenrod", "brown"),
  domain = average_pm_by_county_02$Average_PM_2002
)

# Map of 2002 county means: one circle per county, coloured by mean PM2.5 and
# labelled with the rounded value.
PMmap02 <- leaflet(average_pm_by_county_02) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircles(
    lng = ~Lon,
    lat = ~Lat,
    radius = 500,
    color = ~temp.pal02(Average_PM_2002),
    opacity = 1,
    fillOpacity = 1,
    label = ~paste0(round(Average_PM_2002, 2), " PM2.5")
  ) %>%
  addLegend(
    position = "bottomleft",
    pal = temp.pal02,
    values = average_pm_by_county_02$Average_PM_2002,
    title = "Mean Concentrations PM2.5 in 2002",
    opacity = 1
  )
PMmap02
# Green-to-brown colour scale over the 2022 county means.
temp.pal22 <- colorNumeric(
  palette = c("darkgreen", "goldenrod", "brown"),
  domain = average_pm_by_county_22$Average_PM_2022
)

# Map of 2022 county means: one circle per county, coloured by mean PM2.5 and
# labelled with the rounded value.
PMmap22 <- leaflet(average_pm_by_county_22) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircles(
    lng = ~Lon,
    lat = ~Lat,
    radius = 500,
    color = ~temp.pal22(Average_PM_2022),
    opacity = 1,
    fillOpacity = 1,
    label = ~paste0(round(Average_PM_2022, 2), " PM2.5")
  ) %>%
  addLegend(
    position = "bottomleft",
    pal = temp.pal22,
    values = average_pm_by_county_22$Average_PM_2022,
    title = "Mean Concentrations PM2.5 in 2022",
    opacity = 1
  )
PMmap22
# Unpaired (Welch) two-sample t-test on the county-level mean PM2.5 values:
# 2002 county means vs. 2022 county means.
t_test_county <- t.test(average_pm_by_county_02$Average_PM_2002, average_pm_by_county_22$Average_PM_2022, paired = FALSE)
t_test_county

    Welch Two Sample t-test

data:  average_pm_by_county_02$Average_PM_2002 and average_pm_by_county_22$Average_PM_2022
t = 4.8759, df = 60.659, p-value = 8.163e-06
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 2.615847 6.253725
sample estimates:
mean of x mean of y 
12.239128  7.804342 

There was a statistically significant decrease in the daily average PM2.5 concentrations by county from 2002 to 2022.

p-value = 8.163e-06

Site-specific data

# Per-site summary for 2002: mean and SD of daily PM2.5, plus the mean
# latitude/longitude of the observations sharing a site name.
# NOTE(review): rows are grouped by SiteName, so sites with identical or blank
# names would be pooled - confirm names are unique per site.
average_pm_by_site_02 <- summarize(
  group_by(data_combined[year == 2002], SiteName),
  Average_PM_2002_site = mean(PM2.5, na.rm = TRUE),
  SD_PM_2002_site = sd(PM2.5, na.rm = TRUE),
  Year = mean(year),
  Lat = mean(Lat),
  Lon = mean(Lon)
)

# Same per-site summary for 2022.
average_pm_by_site_22 <- summarize(
  group_by(data_combined[year == 2022], SiteName),
  Average_PM_2022_site = mean(PM2.5, na.rm = TRUE),
  SD_PM_2022_site = sd(PM2.5, na.rm = TRUE),
  Year = mean(year),
  Lat = mean(Lat),
  Lon = mean(Lon)
)

# Stack the 2002 and 2022 site summaries. Their value columns carry
# year-specific names (Average_PM_2002_site vs. Average_PM_2022_site), so the
# tables are bound by position. Stating use.names = FALSE makes that explicit
# and avoids the name-mismatch message; the result keeps the 2002 column names,
# with the Year column distinguishing the rows.
Site_mean <- rbindlist(
  list(average_pm_by_site_02, average_pm_by_site_22),
  use.names = FALSE
)
Column 2 ['Average_PM_2022_site'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
# Viridis colour scale spanning the stacked site means.
# NOTE(review): color_palette does not appear to be used by any map in the
# visible part of the document.
color_palette <- colorNumeric("viridis", Site_mean$Average_PM_2002_site)

# Green-to-brown colour scale over the 2002 site means.
temp.pal02.s <- colorNumeric(
  palette = c("darkgreen", "goldenrod", "brown"),
  domain = average_pm_by_site_02$Average_PM_2002_site
)

# Map of 2002 site means: one circle per site name, coloured by mean PM2.5 and
# labelled with the rounded value.
PMmap02.s <- leaflet(average_pm_by_site_02) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircles(
    lng = ~Lon,
    lat = ~Lat,
    radius = 500,
    color = ~temp.pal02.s(Average_PM_2002_site),
    opacity = 1,
    fillOpacity = 1,
    label = ~paste0(round(Average_PM_2002_site, 2), " PM2.5")
  ) %>%
  addLegend(
    position = "bottomleft",
    pal = temp.pal02.s,
    values = average_pm_by_site_02$Average_PM_2002_site,
    title = "Mean Concentrations PM2.5 by site in 2002",
    opacity = 1
  )
PMmap02.s
# Green-to-brown colour scale over the 2022 site means.
temp.pal22.s <- colorNumeric(
  palette = c("darkgreen", "goldenrod", "brown"),
  domain = average_pm_by_site_22$Average_PM_2022_site
)

# Map of 2022 site means: one circle per site name, coloured by mean PM2.5 and
# labelled with the rounded value.
PMmap22.s <- leaflet(average_pm_by_site_22) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircles(
    lng = ~Lon,
    lat = ~Lat,
    radius = 500,
    color = ~temp.pal22.s(Average_PM_2022_site),
    opacity = 1,
    fillOpacity = 1,
    label = ~paste0(round(Average_PM_2022_site, 2), " PM2.5")
  ) %>%
  addLegend(
    position = "bottomleft",
    pal = temp.pal22.s,
    values = average_pm_by_site_22$Average_PM_2022_site,
    title = "Mean Concentrations PM2.5 by site in 2022",
    opacity = 1
  )
PMmap22.s
# Unpaired (Welch) two-sample t-test on the site-level mean PM2.5 values:
# 2002 site means vs. 2022 site means.
t_test_site <- t.test(average_pm_by_site_02$Average_PM_2002_site, average_pm_by_site_22$Average_PM_2022_site, paired = FALSE)
t_test_site

    Welch Two Sample t-test

data:  average_pm_by_site_02$Average_PM_2002_site and average_pm_by_site_22$Average_PM_2022_site
t = 7.2763, df = 126.86, p-value = 3.156e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 3.754900 6.560148
sample estimates:
mean of x mean of y 
13.211227  8.053703 

There was a statistically significant decrease in the daily average PM2.5 concentrations by site from 2002 to 2022.

p-value = 3.156e-11